Python 提取word 文件中的脚本

#Python #提取word #文件中的脚本

请写一个 Python 程序，提取 Word 文档中的 bash 脚本与 SQL 脚本。
1、宋体“小二”标题：用 “-” 包围，并在后面添加两个回车，例如这样：
--------------------------------------------------------------------
-----------------  LAB01-安装说明  -------------------------------
--------------------------------------------------------------------
2、提取宋体“四号”标题，在前面添加“--”，并在后面添加两个回车，例如这样：
--一、添加主机名解析
3、宋体“五号”标题：只提取带有“1、”这样标号的文本，其他文字信息不提取；在前面添加“--”，例如这样：
--1、关闭并禁用防火墙。
4、bash 脚本与 SQL 脚本 存放在 一行一列的表格中。请只提取一行一列的表格内容。其他表格的内容不提取。
5、请注意标题与脚本的顺序保持原样。
6、扫描当前目录下的多个 Word 文件，并将提取出来的文本写入与对应 Word 文件同名、后缀名为“.sql”的文件中。

import os
import glob
from docx import Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import Table
import re


def get_font_size(paragraph):
    """Return the first truthy run font size of a paragraph, in points.

    Runs are inspected in order and only ``run.font.size`` is consulted.

    Args:
        paragraph: Object exposing ``runs``; each run exposes ``font.size``,
            whose value (when set) has a ``pt`` attribute.

    Returns:
        The ``pt`` value of the first run whose ``font.size`` is truthy, or
        ``None`` when there are no runs or none of them has such a size.
    """
    sized_runs = (run for run in (paragraph.runs or ()) if run.font.size)
    first_sized = next(sized_runs, None)
    if first_sized is None:
        return None
    return first_sized.font.size.pt


def is_chinese_heading(text):
    """Tell whether *text* begins with a Chinese-numeral ordinal marker.

    Surrounding whitespace is ignored. A match requires one or more of the
    characters 一 through 十 immediately followed by the enumeration comma
    "、" (for example "一、" or "十二、").

    Returns:
        ``True`` if the stripped text starts with such a marker, else ``False``.
    """
    candidate = text.strip()
    return re.match(r'^[一二三四五六七八九十]+、', candidate) is not None


def has_number_prefix(text):
    """Tell whether *text* begins with a numeric list marker such as "1、" or "1.".

    Surrounding whitespace is ignored. The only accepted form is one or more
    digits followed by "、" or ".".

    Returns:
        ``True`` if the stripped text starts with an accepted marker,
        otherwise ``False``.
    """
    candidate = text.strip()
    # Parenthesised markers, "(1)" and the full-width "（1）", are not
    # recognised: those patterns are disabled in this version.
    accepted_prefixes = (
        r'^\d+[、.]',
    )
    return any(re.match(prefix, candidate) for prefix in accepted_prefixes)


def _heading_lines(paragraph):
    """Return the output lines for one body paragraph (empty list if it is not a heading)."""
    text = paragraph.text.strip()
    if not text:
        return []

    font_size = get_font_size(paragraph)

    # Large title: an explicit run size within 17-22 pt (the "小二" size is
    # about 18 pt) is rendered as a three-line dashed banner.
    if font_size and 17 <= font_size <= 22:
        separator = "-" * 70
        title_line = f"-----------------  {text}  -------------------------------"
        return [f"{separator}\n", f"{title_line}\n", f"{separator}\n\n"]

    # Section heading, recognised by a Chinese ordinal prefix ("一、", "二、", ...).
    if is_chinese_heading(text):
        return [f"--{text}\n\n"]

    # Step heading, recognised by a numeric prefix ("1、", "2.", ...).
    if has_number_prefix(text):
        return [f"--{text}\n"]

    # Any other paragraph is ordinary prose and is not extracted.
    return []


def _script_lines(table):
    """Return the script lines held by a 1x1 table (empty list for any other table)."""
    if len(table.rows) != 1 or len(table.columns) != 1:
        return []

    cell_text = table.cell(0, 0).text
    if not cell_text.strip():
        return []

    # Keep every non-blank line as-is (indentation included), then add one
    # blank line to separate this script from whatever follows.
    lines = [line + '\n' for line in cell_text.split('\n') if line.strip()]
    lines.append('\n')
    return lines


def extract_scripts_from_docx(docx_path):
    """Extract headings and single-cell-table scripts from a Word document.

    The direct children of the document body are visited in document order,
    so headings and scripts keep their original relative order. Paragraphs
    contribute heading lines; tables contribute their text only when they
    have exactly one row and one column.

    Args:
        docx_path: Path of the ``.docx`` file to read.

    Returns:
        A list of strings ready for ``file.writelines``. Each entry ends
        with a newline, and some entries end with two.
    """
    # Function-local import so this block does not depend on an edit to the
    # file's import section.
    from docx.text.paragraph import Paragraph

    doc = Document(docx_path)
    output_lines = []

    for element in doc.element.body:
        if isinstance(element, CT_P):  # paragraph
            # Wrap the element directly instead of searching doc.paragraphs
            # for it: that search rebuilt the whole paragraph list for every
            # body element, making extraction quadratic in document length.
            output_lines.extend(_heading_lines(Paragraph(element, doc)))
        elif isinstance(element, CT_Tbl):  # table
            output_lines.extend(_script_lines(Table(element, doc)))

    return output_lines


def process_word_file(docx_path):
    """Extract one Word file into a sibling ``.sql`` file.

    The output path is ``docx_path`` with its extension replaced by ``.sql``;
    it is written as UTF-8 and overwritten if it already exists. Progress
    and errors are reported on stdout.

    Args:
        docx_path: Path of the ``.docx`` file to process.

    Returns:
        ``True`` when the ``.sql`` file was written; ``False`` when the input
        does not exist or any exception occurred during extraction/writing.
    """
    if not os.path.exists(docx_path):
        print(f"  ❌ 文件不存在: {docx_path}")
        return False

    try:
        extracted = extract_scripts_from_docx(docx_path)

        stem, _extension = os.path.splitext(docx_path)
        sql_path = f"{stem}.sql"

        with open(sql_path, 'w', encoding='utf-8') as sql_file:
            sql_file.write("".join(extracted))

        sql_name = os.path.basename(sql_path)
        print(f"  ✓ 提取完成: {sql_name} (共 {len(extracted)} 行)")
        return True

    except Exception as exc:
        # Batch boundary: report the failure and let the caller continue
        # with the remaining files.
        docx_name = os.path.basename(docx_path)
        print(f"  ❌ 处理失败: {docx_name} - 错误: {exc!s}")
        return False


def process_multiple_word_files():
    """Process every ``*.docx`` file in the current working directory.

    Files whose name starts with ``~$`` are skipped. Each remaining file is
    passed to ``process_word_file``, and a success/failure summary is
    printed at the end.

    Returns:
        None. All results are reported on stdout.
    """
    directory = os.getcwd()
    file_pattern = "*.docx"

    # Escape the directory part so glob metacharacters in the path itself
    # ("[", "]", "*", "?") are matched literally; only the pattern is a glob.
    search_path = os.path.join(glob.escape(directory), file_pattern)

    # Skip temporary files (names starting with "~$"); sort so the processing
    # order is stable across runs.
    docx_files = sorted(
        path for path in glob.glob(search_path)
        if not os.path.basename(path).startswith('~$')
    )

    if not docx_files:
        print("❌ 当前目录下未找到.docx文件")
        return

    print(f"\n📁 当前目录: {directory}")
    print(f"📁 找到 {len(docx_files)} 个Word文档:")
    for path in docx_files:
        print(f"  - {os.path.basename(path)}")

    print("\n🔧 开始处理...\n")

    success_count = 0
    fail_count = 0

    for idx, docx_file in enumerate(docx_files, 1):
        print(f"[{idx}/{len(docx_files)}] 处理: {os.path.basename(docx_file)}")
        if process_word_file(docx_file):
            success_count += 1
        else:
            fail_count += 1

    print("\n" + "=" * 60)
    print("✅ 处理完成!")
    print(f"   成功: {success_count} 个文件")
    if fail_count > 0:
        print(f"   失败: {fail_count} 个文件")
    print("=" * 60)


# Script entry point: run the batch extraction over the current working
# directory only when this file is executed directly, not when imported.
if __name__ == "__main__":
    process_multiple_word_files()